Vignette for ArchR: https://www.archrproject.com/bookdown/getting-started-with-archr.html
MEF_IFNb6h_10xTn5_CellRangerATAC_fragments.tsv.gz MEF_IFNb6h_Tn5hc_30perc_CellRangerATAC_fragments.tsv.gz MEF_IFNb6h_Tn5hc_CellRangerATAC_fragments.tsv.gz 20230912_MEF_IFNb6h_UMAP.ipynb
### Libraries
library(magrittr)
library(ArchR)
library(ggpubr)
library(purrr)
library(Seurat)
library(parallel)
library(scDblFinder)
library(qs)
source("plotCellQuality.R")
source("plotUmapQC.R")
source("plotUmapClusters.R")
source("plotUmaps.R")
### Set ArchR parameters.
addArchRThreads(threads = 30) # number of parallel threads
addArchRGenome("mm10") # genome and gene annotation ("hg19", "hg38", "mm9", "mm10")
### Set HDF5 environment variables to prevent HDF5 error later
Sys.setenv(HDF5_USE_FILE_LOCKING=FALSE)
Sys.setenv(RHDF5_USE_FILE_LOCKING=FALSE)
### Samples
samples <- c("MEF_IFNb6h_10xTn5", "MEF_IFNb6h_Tn5hc_30perc", "MEF_IFNb6h_Tn5hc")
### Sample palette
sample_palette <- c("#68c6a4", "#f2b531", "#ed5958")
names(sample_palette) <- samples
### Data
inputFiles <- c("MEF_IFNb6h_10xTn5_CellRangerATAC_fragments.tsv.gz",
"MEF_IFNb6h_Tn5hc_30perc_CellRangerATAC_fragments.tsv.gz",
"MEF_IFNb6h_Tn5hc_CellRangerATAC_fragments.tsv.gz")
names(inputFiles) <- samples
ArrowFiles <- createArrowFiles(
inputFiles = inputFiles,
sampleNames = names(inputFiles),
minTSS = 0, #Dont set this too high because you can always increase later
minFrags = 100,
maxFrags = 5000000,
addTileMat = TRUE,
addGeneScoreMat = FALSE,
verbose = FALSE,
subThreading = FALSE
)
names(ArrowFiles) <- gsub("\\.arrow", "", ArrowFiles)
proj_list <- list()
for (sample in samples){
proj_list[[sample]] <- ArchRProject(
ArrowFiles = ArrowFiles[[sample]],
outputDirectory = paste0("ArchRProj_", sample),
copyArrows = TRUE #This is recommened so that if you modify the Arrow files you have an original copy for later usage.
)
}
log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Samples, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1),
legend.title = element_text(size=28), legend.text = element_text(size=28)) +
scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Samples, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1),
legend.title = element_text(size=28), legend.text = element_text(size=28)) +
scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=32, repr.plot.height=20)
ggarrange(log10frags_samples, tss_samples, ncol = 2)
minFrag <- list(c(3, 3.8), c(3, 3.9), c(4, 4.3))
names(minFrag) <- samples
minTSS <- list(c(4, 5), c(4, 12), c(4, 12))
names(minTSS) <- samples
p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, groups = "Samples",
set_xlim = c(2, 6.1), set_ylim = c(0, 50),
cutoff_feature2 = minTSS[[sample]], cutoff_feature1 = minFrag[[sample]])})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=26)
ggarrange(plotlist = p_list, ncol = 2, nrow = 2)
minFrag <- c(3.8, 3.9, 4.3)
names(minFrag) <- samples
proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[which(getCellColData(proj_list[[x]], select = "log10(nFrags)")[,1] >= minFrag[x])], ]})
names(proj_list) <- samples
minTSS <- c(5, 12, 12)
names(minTSS) <- samples
proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[which(getCellColData(proj_list[[x]], select = "TSSEnrichment")[,1] > minTSS[x])], ]})
names(proj_list) <- samples
log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Samples, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1),
legend.title = element_text(size=28), legend.text = element_text(size=28)) +
scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Samples, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1),
legend.title = element_text(size=28), legend.text = element_text(size=28)) +
scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=32, repr.plot.height=20)
ggarrange(log10frags_samples, tss_samples, ncol = 2)
p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, groups = "Samples",
set_xlim = c(2, 6.1), set_ylim = c(0, 50),
cutoff_feature2 = minTSS[[sample]], cutoff_feature1 = minFrag[[sample]])})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=26)
ggarrange(plotlist = p_list, ncol = 2, nrow = 2)
proj_list <- lapply(proj_list, function(x){addIterativeLSI(
ArchRProj = x,
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = 1:30,
force = TRUE
)})
IterativeLSI_list <- lapply(proj_list, function(x){getReducedDims(x, "IterativeLSI", returnMatrix = FALSE)})
options(repr.plot.width=21, repr.plot.height=5)
par(mfrow=c(1, 3))
for (x in samples){
plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', main=x, ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1)
plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1, ylim=c(0,1500))
plot(IterativeLSI_list[[x]]$corToDepth$none, bty='l', xlab='LSI components', ylab='Correlation to sequencing depth', pch=16, cex.lab=1.2, cex.axis=1.1)
}
LSIcomponents <- list(2:15, 2:16, 2:15)
names(LSIcomponents) <- samples
proj_list <- lapply(names(proj_list), function(x){addIterativeLSI(
ArchRProj = proj_list[[x]],
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = LSIcomponents[[x]],
force = TRUE
)})
names(proj_list) <- samples
proj_list <- lapply(proj_list, function(x){addUMAP(
ArchRProj = x,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)})
umap_list <- lapply(names(proj_list), function(x){plotUmapQC(proj_list[[x]], "UMAP", "Sample", x, doubletScore=FALSE, coloringPalette=sample_palette)})
names(umap_list) <- names(proj_list)
options(repr.plot.width=25, repr.plot.height=60)
lapply(umap_list, function(x){ggpubr::ggarrange(plotlist=x, ncol=3, nrow=2)}) %>% ggpubr::ggarrange(plotlist=., ncol=1, nrow=3)
amulet_res <- lapply(samples, function(sample){res <- amulet(inputFiles[sample],
barcodes = rownames(proj_list[[sample]]@cellColData) %>% gsub(paste0(sample, "#"), "", .),
regionsToExclude = GRanges(c("M","chrM","MT","X","Y","chrX","chrY"), IRanges(1L, width=10^8)), # excluding repeats, as well as sex and mitochondrial chromosomes
uniqueFrags = TRUE, # only use unique fragments
maxFragSize = 1000L, # maximum fragment size to consider
removeHighOverlapSites = TRUE); # remove sites that have more than two reads in unexpectedly many cells
rownames(res) <- paste0(sample, "#", rownames(res));
colnames(res) <- paste0("Amulet_", colnames(res));
qsave(res, paste0("Amulet_DoubletDetection_", sample, ".qs"));
return(res)})
for (sample in samples){
proj_list[[sample]]@cellColData <- cbind(proj_list[[sample]]@cellColData, amulet_res[[sample]][match(rownames(proj_list[[sample]]@cellColData), rownames(amulet_res[[sample]])),])
proj_list[[sample]]@cellColData$Amulet_doublet <- proj_list[[sample]]@cellColData$Amulet_q.value <= 0.05
}
plot_list <- list()
for (sample in samples){
plot_list[["1"]][[sample]] <- ggplot(as.data.frame(proj_list[[sample]]@cellColData), aes(x = log10(Amulet_nFrags), y = log10(Amulet_q.value))) + ggpointdensity::geom_pointdensity() +
ggtitle(sample) +
ggpubr::stat_cor(p.accuracy = 0.001, r.accuracy = 0.01, method = "spearman", size = 5) +
theme_minimal() + theme(text = element_text(size = 15))
plot_list[["2"]][[sample]] <- ggplot(as.data.frame(proj_list[[sample]]@cellColData), aes(x = log10(Amulet_nFrags), y = log10(Amulet_p.value))) + ggpointdensity::geom_pointdensity() +
ggtitle(sample) +
ggpubr::stat_cor(p.accuracy = 0.001, r.accuracy = 0.01, method = "spearman", size = 5) +
theme_minimal() + theme(text = element_text(size = 15))
}
options(repr.plot.width=18, repr.plot.height=5)
ggarrange(plotlist = plot_list[["1"]], ncol = 3)
ggarrange(plotlist = plot_list[["2"]], ncol = 3)
umap_list <- list()
for (sample in samples){
umap_list[[sample]] <- plotEmbedding(proj_list[[sample]], embedding = "UMAP",
colorBy = "cellColData", name = "Amulet_doublet",
pal = c("FALSE" = "grey", "TRUE" = "darkred"), size = 0.5)
}
options(repr.plot.width=18, repr.plot.height=7)
ggarrange(plotlist = umap_list, ncol=3, nrow=1)
for (sample in samples){
print(sample)
print(table(proj_list[[sample]]@cellColData$Amulet_doublet))
print(round(sum(proj_list[[sample]]@cellColData$Amulet_doublet) / nrow(proj_list[[sample]]@cellColData), 2))
}
proj_list <- lapply(names(proj_list), function(x){proj_list[[x]][proj_list[[x]]$cellNames[!proj_list[[x]]$Amulet_doublet]]})
names(proj_list) <- samples
log10frags_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Samples, levels = samples), log10nFrags = log10(x$nFrags))}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = log10nFrags)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1),
legend.title = element_text(size=28), legend.text = element_text(size=28)) +
scale_fill_manual(values = sample_palette) + ggtitle("Number of fragments") + xlab("") + ylab("log10(nFrags)")
tss_samples <- ggplot(lapply(proj_list, function(x){data.frame(AnalysesGroups = factor(x$Samples, levels = samples), tss = x$TSSEnrichment)}) %>% do.call(rbind, .), aes(x = AnalysesGroups, y = tss)) + geom_violin(aes(fill = AnalysesGroups), alpha = 0.8) + geom_boxplot(alpha = 0.001) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=30), axis.title = element_text(size=28), axis.text = element_text(size=28), axis.text.x=element_text(angle=90, vjust=1, hjust=1),
legend.title = element_text(size=28), legend.text = element_text(size=28)) +
scale_fill_manual(values = sample_palette) + ggtitle("TSS enrichment scores") + xlab("") + ylab("TSS enrichment score")
options(repr.plot.width=32, repr.plot.height=20)
ggarrange(log10frags_samples, tss_samples, ncol = 2)
p_list <- lapply(samples, function(sample){plotCellQuality(proj_list[[sample]], sample, groups = "Samples",
set_xlim = c(2, 6.1), set_ylim = c(0, 50),
cutoff_feature2 = minTSS[[sample]], cutoff_feature1 = minFrag[[sample]])})
names(p_list) <- samples
options(repr.plot.width=30, repr.plot.height=26)
ggarrange(plotlist = p_list, ncol = 2, nrow = 2)
proj_list <- lapply(proj_list, function(x){addIterativeLSI(
ArchRProj = x,
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = 1:30,
force = TRUE
)})
IterativeLSI_list <- lapply(proj_list, function(x){getReducedDims(x, "IterativeLSI", returnMatrix = FALSE)})
options(repr.plot.width=21, repr.plot.height=5)
par(mfrow=c(1, 3))
for (x in samples){
plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', main=x, ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1)
plot(IterativeLSI_list[[x]]$svd$d, bty='l', xlab='LSI components', ylab='SVD deviation', pch=16, cex.lab=1.2, cex.axis=1.1, ylim=c(0,1500))
plot(IterativeLSI_list[[x]]$corToDepth$none, bty='l', xlab='LSI components', ylab='Correlation to sequencing depth', pch=16, cex.lab=1.2, cex.axis=1.1)
}
LSIcomponents <- list(2:10, 2:10, 2:12, 2:12)
names(LSIcomponents) <- samples
proj_list <- lapply(names(proj_list), function(x){addIterativeLSI(
ArchRProj = proj_list[[x]],
useMatrix = "TileMatrix",
name = "IterativeLSI",
iterations = 2,
clusterParams = list( #See Seurat::FindClusters
resolution = c(0.2),
sampleCells = 10000,
n.start = 10
),
varFeatures = 25000,
dimsToUse = LSIcomponents[[x]],
force = TRUE
)})
names(proj_list) <- samples
proj_list <- lapply(proj_list, function(x){addUMAP(
ArchRProj = x,
reducedDims = "IterativeLSI",
name = "UMAP",
nNeighbors = 30,
minDist = 0.5,
metric = "cosine",
force = TRUE
)})
umap_list <- lapply(names(proj_list), function(x){plotUmapQC(proj_list[[x]], "UMAP", "Sample", x, doubletScore=FALSE, coloringPalette=sample_palette)})
names(umap_list) <- names(proj_list)
options(repr.plot.width=25, repr.plot.height=60)
lapply(umap_list, function(x){ggpubr::ggarrange(plotlist=x, ncol=3, nrow=2)}) %>% ggpubr::ggarrange(plotlist=., ncol=1, nrow=3)
umap_clusters_filtered <- lapply(proj_list, function(x){plotUmapClusters(x, reducedDims="IterativeLSI",
embedding="UMAP", resolutions = c(0.05, 0.1, 0.2))})
options(repr.plot.width=28, repr.plot.height=30)
lapply(umap_clusters_filtered, function(x){ggpubr::ggarrange(plotlist=x, ncol=3, nrow=1)}) %>% ggpubr::ggarrange(plotlist=., ncol=1, nrow=3)
proj_list <- lapply(names(proj_list), function(x){addClusters(proj_list[[x]],
reducedDims = "IterativeLSI", dimsToUse = LSIcomponents[[x]],
method = "Seurat", name = "Clusters", resolution = 0.1)})
proj_list <- lapply(proj_list, function(x){x$Sample_cluster <- paste(x$Sample, x$Clusters, sep="_"); print(table(x$Sample_cluster)); return(x)})
names(proj_list) <- samples
umap_list <- lapply(proj_list, function(x){plotUmaps(x, embeddings = c("UMAP"), coloringLayer = "Sample_cluster")[[1]]})
options(repr.plot.width=20, repr.plot.height=20)
ggarrange(plotlist = umap_list, ncol=2, nrow=2)
### Save ArchRProject
for (sample in samples){
saveArchRProject(ArchRProj = proj_list[[sample]], outputDirectory = paste0("ArchRProj_", sample), load = FALSE)
}
#
Isabelle Seufert
Division of Chromatin Networks, DKFZ
12.09.2023
sessionInfo()